#!/usr/bin/env python3
# Author: Armit
# Create Time: Tue 2025/07/29

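# Pre-download the training data (takes ~2h at 8MB/s)
# ⚠ The raw wav files take 60G and the preprocessed arrow files another 60G: 120G of storage is needed in total!!!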
# https://huggingface.co/blog/audio-datasets#fleurs-2

from datasets import load_dataset, Split

from utils import LANG_TO_LANGUAGE


# NOTE: it is not possible to download only one chosen split; every split is
# downloaded automatically, after which any single split can be loaded :(
# Each entry pairs the exact report prefix with the split it describes.
split_prefixes = (
  (' train: ', Split.TRAIN),
  (' val:   ', Split.VALIDATION),
  (' test:  ', Split.TEST),
)

for lang in LANG_TO_LANGUAGE:
  print(f'>> Download {lang}...')
  # Load the splits one by one, in train/val/test order, and report the
  # number of samples in each.
  for prefix, split in split_prefixes:
    subset = load_dataset('google/fleurs', lang, split=split, trust_remote_code=True)
    print(f'{prefix}{len(subset)}')
print('>> Done!')
